In previous documents, I have:
This was done very inefficiently, unorganized and for the (three) cell types separately. I want to make this more generalized, structured and plot the resulting output in a more summarized fashion.
Overall, this document will replace the documents linking the dynamic LADs to other data sets. The method will be the same, although I would like to design a different output figure to summarize all data much conveniently. The strategy is summarized below.
Various plots.
Initially, I want to prepare the input and output. Load in the libraries and set other constants.
# Load dependencies
suppressMessages(suppressWarnings(library(GenomicRanges)))
suppressMessages(suppressWarnings(library(rtracklayer)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(RColorBrewer)))
suppressMessages(suppressWarnings(library(ggbeeswarm)))
suppressMessages(suppressWarnings(library(limma)))
suppressMessages(suppressWarnings(library(edgeR)))
suppressMessages(suppressWarnings(library(gtools)))# The cell lines and file locations of previous analyses
cells <- c("Hap1", "HCT116", "K562")
dir.input.base <- "ts190708_differential_analysis_BinsandLADs_"
dir.padamid.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/results/normalized/bin-20kb/"
# One output directory, corresponding with this file
dir.output <- "ts190801_dynamic_LAD_correlations_bars"
dir.create(dir.output, showWarnings = FALSE)I always appreciate having the markdown figures as png / pdf, as plotted in the document. This is set below.
Below, I will provide the data sets that will be processed for this document. These consist of (when data is available) :
# Input directories
dir.damid.base <- "/DATA/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/normalized/bin-20kb/"
dir.padamid.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/results/normalized/bin-20kb/"
dir.repliseq.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts190301_pADamID_CellCycle/ts190731_4DN_RepliSeq/bigwigs/"
dir.chip.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/ts190320_pADamID_correlations/histone_correlations/"
data.tracks <- list(
# Hap1
Hap1 = list(damid_lmnb1 = file.path(dir.damid.base,
"Hap1_LMNB1-20kb-combined.norm.txt.gz"),
padamid_h3k27me3 = file.path(dir.padamid.base,
"Hap1_H3K27me3-20kb-combined.norm.txt.gz"),
padamid_h3k9me3 = file.path(dir.padamid.base,
"Hap1_H3K9me3-20kb-combined.norm.txt.gz")),
# HCT116
HCT116 = list(damid_lmnb1 = file.path(dir.damid.base,
"HCT116_LMNB1-20kb-combined.norm.txt.gz"),
repliseq = file.path(dir.repliseq.base,
"HCT116_r1_20kb.bw"),
chip_h3k27me3 = file.path(dir.chip.base,
"ChIPseq-HCT116_H3K27me3-20kb.bw"),
chip_h3k9me3 = file.path(dir.chip.base,
"ChIPseq-HCT116_H3K9me3-20kb.bw")),
# K562
K562 = list(damid_lmnb1 = file.path(dir.damid.base,
"K562_LMNB1-20kb-combined.norm.txt.gz"),
repliseq = file.path(dir.repliseq.base,
"K562_r1_20kb.bw"),
chip_h3k27me3 = file.path(dir.chip.base,
"ChIPseq-K562_H3K27me3-20kb.bw"),
chip_h3k9me3 = file.path(dir.chip.base,
"ChIPseq-K562_H3K9me3-20kb.bw"),
padamid_h3k27me3 = file.path(dir.padamid.base,
"K562_H3K27me3-20kb-combined.norm.txt.gz"),
padamid_h3k9me3 = file.path(dir.padamid.base,
"pADamID-K562_r3_H3K9me3-20kb.norm.txt.gz"))
)
# Confirm that all the files exist
all(file.exists(unlist(data.tracks)))## [1] TRUE
The last part of the set-up phase is to define functions useful for this document.
ProcessDamID <- function(track, name, bins) {
# DamID is easy, as the scores are already in the same bins
damid <- read.table(track,
stringsAsFactors = FALSE,
col.names = c("seqnames", "start", "end", "score"))
damid <- as(damid, "GRanges")
start(damid) <- start(damid) + 1
# Scale
mcols(damid) <- scale(as(mcols(damid), "data.frame"))
# Return damid scores
damid$score
}
ProcessBigWig <- function(track, name, bins, replicates = c("r1", "r2"),
log2 = FALSE, pseudo = 0.1) {
# RepliSeq is easy, as the scores are already in the same bins - mostly
# Even more, there might be more replicates of repliseq, let's take the mean
# of those. I previously binned the histone modifications, meaning I can use
# the same function for them.
# Loop over replicates, adding the score to the bins
for (r in replicates) {
# Read repliseq replicate
repliseq <- import(gsub("r1", r, track))
# Add the score to the bins
ovl <- findOverlaps(bins, repliseq)
mcols(bins)[, r] <- NA
mcols(bins)[queryHits(ovl), r] <- repliseq$score[subjectHits(ovl)]
}
# Calculate the mean repliseq score
if (length(replicates) > 1) {
bins$score <- rowMeans(as(mcols(bins)[, replicates], "data.frame"),
na.rm = T)
} else {
bins$score <- mcols(bins)[, replicates]
}
if (log2) {
bins$score <- log2(bins$score + pseudo)
}
# Return the mean repliseq score
bins$score
}First, I will load the required input and prepare the data structure required.
# Loop over cells, reading their files
samples.df <- list()
LADs.norm <- list()
results <- list()
for (cell in cells) {
# 1) Samples data frame
samples.df <- c(samples.df,
list(readRDS(file.path(paste0(dir.input.base,
cell),
"samples_df.rds"))))
# 2) Normalized LAD scores
LADs.norm <- c(LADs.norm,
list(readRDS(file.path(paste0(dir.input.base,
cell),
"LADs_norm.rds"))))
# 3) Results differential tests
results <- c(results,
list(readRDS(file.path(paste0(dir.input.base,
cell),
"LADs_results.rds"))))
}
# Add names to the lists
names(samples.df) <- names(LADs.norm) <- names(results) <- cells
# Combine samples.df object
samples.df.combined <- data.frame(do.call(rbind, samples.df))
# Change numeric results to logical factor
conversion <- c("-1" = "down", "0" = "-", "1" = "up")
for (cell in cells) {
# Change the integers to factors
results.cell <- data.frame(results[[cell]])
results.copy <- data.frame(do.call(cbind,
lapply(1,
function(i) {
conversion[match(results.cell[, i],
as.numeric(names(conversion)))]
})))
names(results.copy) <- c("G2_G1")
# And add this to the LADs
mcols(LADs.norm[[cell]]) <- cbind(mcols(LADs.norm[[cell]]),
results.copy)
}Next, I want to load (bulk) pA-DamID scores for 20kb bins, for every data set. Also, calculate the combined score for all the LADs.
# Loop over the cells
padamid <- list()
for (cell in cells) {
# Load pA-DamID data, convert to GRanges
padamid.cell <- read.table(file.path(dir.padamid.base,
paste0(cell,
"_LMNB2-20kb-combined.norm.txt.gz")),
stringsAsFactors = FALSE,
col.names = c("seqnames", "start", "end", "score"))
padamid.cell <- as(padamid.cell, "GRanges")
# bed to R -> add 1 to the start
start(padamid.cell) <- start(padamid.cell) + 1
# Get the pA-DamID LAD score
# As with the other data, let's scale the data first
padamid.cell$score <- as.numeric(scale(padamid.cell$score))
padamid <- c(padamid, list(padamid.cell))
# Get the mean score for every LAD
LADs.cell <- LADs.norm[[cell]]
ovl <- findOverlaps(LADs.cell,
padamid.cell)
LADs.norm[[cell]]$padamid <- do.call(c, tapply(subjectHits(ovl),
queryHits(ovl),
function(x) mean(padamid.cell$score[x],
na.rm = T),
simplify = FALSE))
}
names(padamid) <- cellsFor every cell, for every data track I will now perform the analysis as described earlier. For a final figure, I will save the percentage of values higher than zero for every data set. A combination of those might be a nice way to visualize the final result.
First, I will read the files.
# Loop over the cells
for (cell in cells) {
# Loop over the tracks
data.tracks.cell <- data.tracks[[cell]]
track.names <- names(data.tracks.cell)
for (track.name in track.names) {
# Read the track, corresponding to the binned pA-DamID object
if (startsWith(track.name, "chip")) {
tmp <- ProcessBigWig(data.tracks.cell[[track.name]],
name = track.name,
bins = padamid.cell,
replicates = c("r1"))
} else if (startsWith(track.name, "damid") | startsWith(track.name, "padamid")) {
tmp <- ProcessDamID(data.tracks.cell[[track.name]],
name = track.name,
bins = padamid.cell)
} else if (track.name == "repliseq") {
tmp <- ProcessBigWig(data.tracks.cell[[track.name]],
name = track.name,
bins = padamid.cell)
}
# Add the data to the padamid object
mcols(padamid[[cell]])[, track.name] <- tmp
}
# Overlap with LADs
LADs.cell <- LADs.norm[[cell]]
ovl <- findOverlaps(LADs.cell,
padamid[[cell]])
# Get the mean score for every LAD
tmp <- do.call(rbind, tapply(subjectHits(ovl),
queryHits(ovl),
function(x) colMeans(as(mcols(padamid[[cell]])[x, track.names],
"data.frame"),
na.rm = T),
simplify = FALSE))
mcols(LADs.cell)[, track.names] <- tmp
LADs.norm[[cell]] <- LADs.cell
}Plot the progression throughout the cell cycle.
for (cell in cells) {
print(cell)
# Convert to df
df <- as(mcols(LADs.norm[[cell]]), "data.frame")
# Add ID
df$ID <- 1:nrow(df)
# Get the delta - using mean of t=0h
df <- cbind(df[, c("G2_G1", "ID")],
df[, samples.df.combined$name[samples.df.combined$cell == cell]])
df.melt <- melt(df, id.vars = c("G2_G1", "ID"))
df.melt$time <- gsub("_.*", "", gsub(paste0(cell, "_"), "", df.melt$variable))
df.melt$time <- factor(df.melt$time, levels = c("G1", "S", "G2"))
df.melt$G2_G1 <- factor(df.melt$G2_G1, levels = c("down", "-", "up"))
# Plot
plt <- ggplot(df.melt, aes(x = time, y = value, col = G2_G1, fill = G2_G1, group = G2_G1)) +
stat_summary(fun.y = mean, geom = "line", size = 1.5) +
stat_summary(fun.data = mean_sdl, geom = "ribbon", alpha = 0.25, col = NA,
fun.args = list(mult = 1)) +
ggtitle("pA-DamID dynamics over time") +
#facet_grid(. ~ result) +
xlab("Cell cycle phase") +
ylab("pA-DamID (log2)") +
scale_color_brewer(palette = "Set1", name = "result") +
scale_fill_brewer(palette = "Set1", name = "result") +
theme_bw() +
theme(aspect.ratio = 1)
plot(plt)
}## [1] "Hap1"
## [1] "HCT116"
## [1] "K562"
Looks good. Note that the LADs can still decrease a lot after S. Of course, this result is now combined with the S-specific biology and thus unreliable.
How do the dynamic LADs correlate with gene expression?
df.combined <- c()
for (cell in cells) {
print(cell)
# LAD status to genes
LADs <- LADs.norm[[cell]]
ovl <- findOverlaps(rnaseq, LADs, type = "within")
# Convert to df
df <- as(mcols(rnaseq), "data.frame")[, c("gene_id", "gene_name", cell)]
names(df)[3] <- "expr"
df$result <- factor("iLAD", levels = c(levels(LADs$G2_G1), "iLAD"))
df$result[queryHits(ovl)] <- LADs$G2_G1[subjectHits(ovl)]
# Complete cases
df <- df[complete.cases(df), ]
# Plot beeswarm
plt <- ggplot(df, aes(x = result, y = expr, col = result, group = result)) +
geom_quasirandom() +
geom_boxplot(outlier.shape = NA, fill = NA, col = "black", width = 0.3) +
ggtitle("Gene change versus gene expression") +
xlab("pA-DamID change") +
ylab("Gene expression (rlog2)") +
scale_color_brewer(palette = "Set1") +
theme_bw() +
theme(aspect.ratio = 1)
# plot(plt)
# What are active genes?
plt <- ggplot(df, aes(x = expr)) +
geom_histogram(binwidth = 1) +
geom_vline(xintercept = 7, col = "red", linetype = "dashed") +
xlab("Gene expression (rlog)") +
ylab("Count") +
ggtitle("Gene expression histogram") +
theme_bw() +
theme(aspect.ratio = 1)
# plot(plt)
# Let's say rlog > 7
cutoff <- 7
# Gene density
LADs$gene_density <- countOverlaps(LADs, rnaseq, type = "any") / width(LADs) * 1e6
LADs$gene_density.active <- countOverlaps(LADs, rnaseq[mcols(rnaseq)[, cell] > cutoff],
type = "any") / width(LADs) * 1e6
# Get the data frame
df <- as(mcols(LADs), "data.frame")
# df <- df[complete.cases(df), ]
# Plot beeswarm
plt <- ggplot(df, aes(x = G2_G1, y = gene_density, fill = G2_G1, group = G2_G1)) +
geom_boxplot(outlier.shape = NA) +
ggtitle("Gene change versus gene density") +
xlab("pA-DamID change") +
ylab("# genes / Mb") +
scale_color_brewer(palette = "Set1", name = "result") +
coord_cartesian(ylim = c(0, 18)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# plot(plt)
plt <- ggplot(df, aes(x = G2_G1, y = gene_density.active, fill = G2_G1, group = G2_G1)) +
geom_boxplot(outlier.shape = NA) +
ggtitle("Gene change versus gene density") +
xlab("pA-DamID change") +
ylab("# active genes / Mb") +
scale_color_brewer(palette = "Set1", name = "result") +
coord_cartesian(ylim = c(0, 6)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# plot(plt)
df$cell <- cell
df.combined <- rbind(df.combined,
df[, c("gene_density.active", "cell", "G2_G1")])
}## [1] "Hap1"
## [1] "HCT116"
## [1] "K562"
df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))
plt <- ggplot(df.combined, aes(x = cell, y = gene_density.active,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Active gene density") +
xlab("") +
ylab("# active genes / Mb") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
coord_cartesian(ylim = c(0, 6)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)Variable.
How are the replication timing scores for these changing domains?
df.combined <- c()
for (cell in cells) {
if (cell == "Hap1") {
next
}
print(cell)
LADs <- LADs.norm[[cell]]
# Get the data frame
df <- as(mcols(LADs), "data.frame")
df <- df[complete.cases(df), ]
# Get G1 and G2 scores
df$G1 <- rowMeans(df[, grep("G1", samples.df[[cell]]$name)])
df$G2 <- rowMeans(df[, grep("G2", samples.df[[cell]]$name)])
# Plot beeswarm
plt <- ggplot(df, aes(x = G2_G1, y = repliseq, fill = G2_G1)) +
geom_boxplot(outlier.shape = NA) +
ggtitle("Repliseq change versus gene expression") +
xlab("pA-DamID change") +
ylab("RepliSeq") +
scale_color_brewer(palette = "Set1", name = "result") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# plot(plt)
df$cell <- cell
df.combined <- rbind(df.combined,
df[, c("repliseq", "cell", "G2_G1",
"G1", "G2")])
}## [1] "HCT116"
## [1] "K562"
df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))
plt <- ggplot(df.combined, aes(x = cell, y = repliseq,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Repliseq score") +
xlab("") +
ylab("repliseq") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
# coord_cartesian(ylim = c(0, 6)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)# Also, I want to test if early-replicating DNA is always higher in G1 and
# late-replicating DNA is always higher in G2.
df.combined$class <- "mid"
df.combined$class[df.combined$repliseq > quantile(df.combined$repliseq, 0.8)] <- "early"
df.combined$class[df.combined$repliseq < quantile(df.combined$repliseq, 0.2)] <- "late"
df.combined$class <- factor(df.combined$class, levels = c("early", "mid", "late"))
plt <- ggplot(df.combined, aes(x = cell, y = G2 - G1,
fill = class)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Repliseq score") +
xlab("") +
ylab("repliseq") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
# coord_cartesian(ylim = c(0, 6)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)Variable.
Finally, let’s make some plots on the characteristics: LAD strength and LAD size.
df.combined <- c()
for (cell in cells) {
print(cell)
# Get all the data
df <- as(LADs.norm[[cell]], "data.frame")
# 1) Size distribution versus changes
plt <- ggplot(df, aes(x = G2_G1, y = width / 1e6, fill = G2_G1)) +
geom_boxplot(outlier.shape = NA) +
ggtitle("LAD size versus LAD change") +
xlab("pA-DamID change") +
ylab("LAD size (Mb)") +
scale_color_brewer(palette = "Set1", name = "result") +
coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# plot(plt)
# 2) Initial strength
# Note: this can already be seen in the first plot
df$cell <- cell
df.combined <- rbind(df.combined,
df[, c("width", "cell", "G2_G1")])
}## [1] "Hap1"
## [1] "HCT116"
## [1] "K562"
df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))
plt <- ggplot(df.combined, aes(x = cell, y = width / 1e6,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("LAD size") +
xlab("") +
ylab("LAD size (Mb)") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)Distance to telomeres / centromeres.
# Read chrom sizes
chrom_sizes <- read.table("/DATA/usr/t.v.schaik/data/genomes/hg19/hg19_chrom_sizes.txt", sep = "\t")
row.names(chrom_sizes) <- chrom_sizes[, 1]
# Function to scale chromosome arms
ScaleChromosomeArms <- function(gr, chrom_sizes, centromeres.gr, inverse = FALSE) {
# For a given "gr" GRanges object, calculate the relative distance to a
# chromosome arm.
# Calculate distance to centromere for every gene
gr$distance_to_centromere <- mcols(distanceToNearest(gr, centromeres.gr))$distance
gr$distance_to_centromere <- as.numeric(gr$distance_to_centromere)
# For each chromosome
# if start of gene is left of the centromere,
# normalize the distance by the first base - start centromere distance
# else
# normalize by the distance by the end centromere - last base
for (chr in seqlevels(gr)) {
centromeres.chr <- centromeres.gr[seqnames(centromeres.gr) == chr]
left_arm <- start(centromeres.chr) - 1
right_arm <- chrom_sizes[chr, 2] - end(centromeres.chr)
idx <- seqnames(gr) %in% chr
gr.tmp <- gr[idx]
norm_dis <- ifelse(start(gr.tmp) < start(centromeres.chr),
gr.tmp$distance_to_centromere / left_arm,
gr.tmp$distance_to_centromere / right_arm)
# If inverse, inverse the distance (meaning towards telomeres)
if (inverse) {
norm_dis <- 1 - norm_dis
}
mcols(gr[idx])[, "distance_to_centromere"] <- norm_dis
}
# Return normalized distances
gr$distance_to_centromere
}
# Define telomeres & load centromeres
telomeres <- GRanges(seqnames = rep(seqlevels(LADs), times = 2),
ranges = IRanges(start = c(rep(1, length(seqlevels(LADs))),
seqlengths(LADs)),
end = c(rep(1, length(seqlevels(LADs))),
seqlengths(LADs))))
centromeres <- import("ts190708_differential_analysis_BinsandLADs_K562/centromeres.bed")
# Prep output df
df.combined <- c()
for (cell in cells) {
LADs <- LADs.norm[[cell]]
# Distance to telomeres data frame
LADs$distance_to_telomeres <- mcols(distanceToNearest(LADs, telomeres))$distance / 1e6
LADs$distance_to_centromeres <- mcols(distanceToNearest(LADs, centromeres))$distance / 1e6
LADs$distance_to_telomeres_scaled <- ScaleChromosomeArms(LADs, chrom_sizes,
centromeres, inverse = T)
LADs$seqnames <- seqnames(LADs)
# Add mean G1 / G2
mcols(LADs)[, "G1"] <- rowMeans(as(mcols(LADs), "data.frame")[, samples.df[[cell]]$name[samples.df[[cell]]$phase == "G1"]])
mcols(LADs)[, "G2"] <- rowMeans(as(mcols(LADs), "data.frame")[, samples.df[[cell]]$name[samples.df[[cell]]$phase == "G2"]])
mcols(LADs)[, "diff"] <- LADs$G2 - LADs$G1
# Add LAD size
mcols(LADs)[, "size"] <- width(LADs) / 1e6
df <- as(mcols(LADs), "data.frame")
df$cell <- cell
df.combined <- rbind(df.combined,
df[, c("distance_to_telomeres", "distance_to_centromeres",
"distance_to_telomeres_scaled", "G1", "G2", "diff",
"size", "cell", "G2_G1", "seqnames")])
}
df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))
# Order chromosomes by length
df.combined$seqnames <- factor(df.combined$seqnames,
levels = seqlevels(LADs)[order(seqlengths(LADs),
decreasing = T)])
# Plots
# 1) Telomere distance
plt <- ggplot(df.combined, aes(x = cell, y = distance_to_telomeres,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Telomeres") +
xlab("") +
ylab("Distance to telomeres (Mb)") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
#coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)# 2) Centromere distance
plt <- ggplot(df.combined, aes(x = cell, y = distance_to_centromeres,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Centromeres") +
xlab("") +
ylab("Distance to centromeres (Mb)") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
#coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)# Extra plots
# 1) Scaled chromosome arms to prevent the outliers chromosomes to account for
# differences in chromosome length
plt <- ggplot(df.combined, aes(x = cell, y = distance_to_telomeres_scaled,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Telomeres - scaled") +
xlab("") +
ylab("Distance to telomeres - scaled") +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
#coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)# 2) Distribution of dynamic LADs across the chromosomes
plt <- ggplot(df.combined, aes(x = seqnames, fill = G2_G1)) +
geom_bar() +
ggtitle("Telomeres - scaled") +
xlab("") +
ylab("# Dynamic LADs") +
facet_grid(cell ~ .) +
#scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
#coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)# 3) Overkill: per-chromosome plots
plt <- ggplot(df.combined, aes(x = cell, y = distance_to_telomeres_scaled,
fill = G2_G1)) +
geom_boxplot(outlier.shape = NA, position = "dodge") +
ggtitle("Telomeres") +
xlab("") +
ylab("Distance to telomeres (Mb)") +
facet_wrap(~ seqnames) +
scale_x_discrete(drop = FALSE) +
scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
#coord_cartesian(ylim = c(0, 8)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 4) LAD differences versus telomere distance
plt <- ggplot(df.combined, aes(x = distance_to_telomeres, y = diff)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("Difference G2 - G1") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 5) LAD differences versus centromere distance
plt <- ggplot(df.combined, aes(x = distance_to_centromeres, y = diff)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Centromere distance") +
xlab("Distance to centromeres (Mb)") +
ylab("Difference G2 - G1") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 6) LAD differences versus centromere distance
plt <- ggplot(df.combined, aes(x = distance_to_telomeres_scaled, y = diff)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("Difference G2 - G1") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 7) LAD differences versus telomere distance - colored by LAD size
plt <- ggplot(df.combined, aes(x = distance_to_telomeres, y = diff)) +
geom_point(aes(col = log2(size)), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("Difference G2 - G1") +
scale_color_continuous(low = "black", high = "lightgrey") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 8) LAD differences versus telomere distance - colored by differential call
plt <- ggplot(df.combined, aes(x = distance_to_telomeres, y = diff)) +
geom_point(aes(col = G2_G1), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("Difference G2 - G1") +
scale_color_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 9) G1 scores near telomeres
plt <- ggplot(df.combined, aes(x = distance_to_telomeres, y = G1)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("G1 score") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)plt <- ggplot(df.combined, aes(x = distance_to_telomeres, y = G2)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ .) +
ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("G2 score") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# 9) scores near telomeres for every time points
df.combined.melt <- melt(df.combined,
measure.vars = c("G1", "G2"))
plt <- ggplot(df.combined.melt, aes(x = distance_to_telomeres, y = value)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ variable) +
#ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("scaled pA-DamID") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# scores near centromeres
plt <- ggplot(df.combined.melt, aes(x = distance_to_centromeres, y = value)) +
geom_point(aes(col = seqnames), #alpha = 0.5,
size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ variable) +
#ggtitle("Centromere distance") +
xlab("Distance to centromeres (Mb)") +
ylab("scaled pA-DamID") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# No colors
plt <- ggplot(df.combined.melt, aes(x = distance_to_telomeres, y = value)) +
geom_point(size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ variable) +
#ggtitle("Telomeres distance") +
xlab("Distance to telomeres (Mb)") +
ylab("scaled pA-DamID") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# scores near centromeres
plt <- ggplot(df.combined.melt, aes(x = distance_to_centromeres, y = value)) +
geom_point(size = 1) +
geom_smooth(method = "loess", se = F) +
geom_hline(yintercept = 0, col = "grey", linetype = "dashed") +
facet_grid(cell ~ variable) +
#ggtitle("Centromere distance") +
xlab("Distance to centromeres (Mb)") +
ylab("scaled pA-DamID") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
aspect.ratio = 1)
plot(plt)# Load cLADs
cLADs <- readRDS("/DATA/usr/t.v.schaik/proj/3D_nucleus/results/ts180130_laminaVsNucleolus/ts180131_differential_localization/DamID_cLADs.rds")
# Prep output df
df.combined <- c()
for (cell in cells) {
LADs <- LADs.norm[[cell]]
ovl <- overlapsAny(cLADs, LADs[LADs$G2_G1 == "-"])
ovl.up <- overlapsAny(cLADs, LADs[LADs$G2_G1 == "up"])
ovl.down <- overlapsAny(cLADs, LADs[LADs$G2_G1 == "down"])
df <- data.frame(class = c("cAD", "ciAD", "fAD"),
all = as.vector(table(cLADs$calls[ovl])),
up = as.vector(table(cLADs$calls[ovl.up])),
down = as.vector(table(cLADs$calls[ovl.down])))
df[, 2:4] <- t(t(df[, 2:4]) / colSums(df[, 2:4]))
df <- melt(df, id.vars = "class")
df$cell <- cell
df.combined <- rbind(df.combined,
df[, c("class", "variable", "value", "cell")])
}
df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))
plt <- ggplot(df.combined, aes(x = variable, y = value, fill = class)) +
geom_bar(stat = "identity") +
ggtitle("cLAD enrichment") +
xlab("") +
ylab("Percentage") +
facet_grid(. ~ cell) +
scale_fill_brewer(palette = "Set2") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plt)It looks as if LAD size & telomere distance are important. Are these complementary: being a small LAD OR close to telomeres, either is fine.
# Prep output df
df.combined <- c()
for (cell in cells) {
LADs <- LADs.norm[[cell]]
# Distance to telomeres data frame
LADs$distance_to_telomeres <- mcols(distanceToNearest(LADs, telomeres))$distance / 1e6
LADs$distance_to_centromeres <- mcols(distanceToNearest(LADs, centromeres))$distance / 1e6
LADs$size <- width(LADs) / 1e6
df <- as(mcols(LADs), "data.frame")
df$cell <- cell
df.combined <- rbind(df.combined,
df[, c("distance_to_telomeres", "distance_to_centromeres", "size",
"cell", "G2_G1")])
}
df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))
df.combined <- df.combined[order(df.combined$G2_G1), ]
# Prepare a scatterplot
plt <- ggplot(df.combined, aes(x = distance_to_telomeres, y = size, color = G2_G1)) +
geom_point(size = 0.5, alpha = 1) +
ggtitle("cLAD enrichment") +
xlab("Distance to telomeres (Mb)") +
ylab("LAD size (Mb)") +
facet_grid(. ~ cell) +
scale_color_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
name = "result") +
theme_bw() +
theme(aspect.ratio = 1)
plot(plt)Important message: there is no correlation between DNA characteristics and lamina cell cycle dynamics. The main feature really is telomere positioning.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.22 gtools_3.8.1 edgeR_3.26.0
## [4] limma_3.40.0 ggbeeswarm_0.6.0 RColorBrewer_1.1-2
## [7] reshape2_1.4.3 ggplot2_3.1.1 rtracklayer_1.44.0
## [10] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0 IRanges_2.18.0
## [13] S4Vectors_0.22.0 BiocGenerics_0.30.0
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.44.0 splines_3.6.1
## [3] Formula_1.2-3 assertthat_0.2.1
## [5] latticeExtra_0.6-28 GenomeInfoDbData_1.2.1
## [7] vipor_0.4.5 Rsamtools_2.0.0
## [9] yaml_2.2.0 pillar_1.3.1
## [11] backports_1.1.4 lattice_0.20-38
## [13] glue_1.3.1 digest_0.6.18
## [15] XVector_0.24.0 checkmate_1.9.3
## [17] colorspace_1.4-1 htmltools_0.3.6
## [19] Matrix_1.2-17 plyr_1.8.4
## [21] XML_3.98-1.19 pkgconfig_2.0.2
## [23] zlibbioc_1.30.0 purrr_0.3.2
## [25] scales_1.0.0 BiocParallel_1.18.0
## [27] tibble_2.1.1 htmlTable_1.13.1
## [29] withr_2.1.2 SummarizedExperiment_1.14.0
## [31] nnet_7.3-12 lazyeval_0.2.2
## [33] survival_2.44-1.1 magrittr_1.5
## [35] crayon_1.3.4 evaluate_0.13
## [37] foreign_0.8-71 beeswarm_0.2.3
## [39] data.table_1.12.2 tools_3.6.1
## [41] matrixStats_0.54.0 stringr_1.4.0
## [43] munsell_0.5.0 locfit_1.5-9.1
## [45] cluster_2.0.9 DelayedArray_0.10.0
## [47] Biostrings_2.52.0 compiler_3.6.1
## [49] rlang_0.3.4 grid_3.6.1
## [51] RCurl_1.95-4.12 rstudioapi_0.10
## [53] htmlwidgets_1.3 labeling_0.3
## [55] bitops_1.0-6 base64enc_0.1-3
## [57] rmarkdown_1.12 gtable_0.3.0
## [59] R6_2.4.0 GenomicAlignments_1.20.0
## [61] gridExtra_2.3 dplyr_0.8.0.1
## [63] Hmisc_4.2-0 stringi_1.4.3
## [65] Rcpp_1.0.1 rpart_4.1-15
## [67] acepack_1.4.1 tidyselect_0.2.5
## [69] xfun_0.6